"""
Code identifies the cord-cutters in the sample and writes out files that can be used in analysis.
Code is written in python 3.

by: Jacob Malone (j.malone@cablelabs.com)
date: August 25, 2021
"""


import pandas as pd

# Define working directory for the data
data_dir = '/mnt/et01/cord_cutting_rep'

# Load the daily file from "02_aggregate_data.py"
print('Loading data....')
dat_file = f'{data_dir}/03_daily_rollup.parquet'
frame = pd.read_parquet(dat_file)

print(f'\nNumber of unique customer keys: {frame["customer_key"].nunique()}')

# Identify people who made change to vid_flag over the sample
print('Calculating vid_flag min/maxes....')
vid_flag_max = frame.groupby('customer_key').vid_flag.max().reset_index()  # Indicator if ever had video
vid_flag_min = frame.groupby('customer_key').vid_flag.min().reset_index()  # Indicator if ever did not have video
vid_flag_max_min = pd.merge(vid_flag_max, vid_flag_min, on='customer_key', suffixes=('_max', '_min'))
# Identify people who did and did not have video at some point over the sample
vid_flag_max_min['made_change'] = ((vid_flag_max_min.vid_flag_max == 1) & (vid_flag_max_min.vid_flag_min == 0))

# Find starting and ending dates for video service
print('Calculating start and end dates for video service....')
dts_w_video = (frame
               .loc[(frame.vid_flag == 1)]
               .groupby('customer_key')
               .date
               .agg(['max', 'min'])
               .reset_index())
dts_w_video.columns = ['customer_key', 'max_dt_w_video', 'min_dt_w_video']

dts_wo_video = (frame
                .loc[(frame.vid_flag == 0)]
                .groupby('customer_key')
                .date
                .agg(['max', 'min'])
                .reset_index())
dts_wo_video.columns = ['customer_key', 'max_dt_wo_video', 'min_dt_wo_video']

print('Identifying cord-cutters....')
video_dates = pd.merge(dts_w_video, dts_wo_video, on='customer_key')  # Merge together with/without video dates
video_dates = pd.merge(video_dates, vid_flag_max_min, on='customer_key')  # Merge in the ever had/did not have video flags
# Identify those who changed video service and the did not have period came after having video
video_dates['cord_cutter'] = ((video_dates.vid_flag_max == 1) &
                              (video_dates.vid_flag_min == 0) &
                              (video_dates.max_dt_w_video < video_dates.min_dt_wo_video))
cord_cutter_ids = video_dates.loc[(video_dates.cord_cutter == True)].customer_key.unique()  # Store list of those who switched

print('Frequency of Cord-Cutters in Sample')
print(video_dates.groupby('cord_cutter').customer_key.nunique())

# Split sample into 2 groups: "cord-cutters" and "not cord-cutters"
print('\nSplitting sample....')
cord_cutters = frame.loc[(frame.customer_key.isin(cord_cutter_ids))].copy()
other_subs = frame.loc[~(frame.customer_key.isin(cord_cutter_ids))].copy()

# Write output files
print('Writing output files....')
out_file = f'{data_dir}/04_cord_cutters.parquet'
cord_cutters.to_parquet(out_file)

out_file = f'{data_dir}/04_other_subs.parquet'
other_subs.to_parquet(out_file)

out_file = f'{data_dir}/04_video_dates.parquet'
video_dates.to_parquet(out_file)
